import requests
from bs4 import BeautifulSoup
from src.db_redis import RedisClient


class WeatherSpider:
    """Scrapes historical weather data from lishi.tianqi.com.

    Crawling is a two-stage producer/consumer pipeline backed by Redis:
    month-page links are first queued (and de-duplicated) via ``RedisClient``,
    then each queued page is popped and parsed into daily weather records.
    Queuing the links means a crash mid-crawl does not lose progress.
    """

    def __init__(self):
        # Landing page that lists every city, grouped alphabetically A-Z.
        self.index_url = "http://lishi.tianqi.com/"
        # Redis-backed DAO: link queue (producer/consumer) + record storage.
        self.dao = RedisClient()

    # Shared by get_index() and crawl_city(); previously duplicated code.
    def _get_city_anchors(self, index_page):
        """Return the <a> tags for every city on the index page.

        Cities are grouped into tables by first letter, one table per
        letter A-Z, each with id ``city-<letter>``.
        """
        soup = BeautifulSoup(index_page, "html.parser")
        anchors = []
        for letter in map(chr, range(ord("A"), ord("Z") + 1)):
            anchors += soup.select("#city-{} > td > ul > li > a".format(letter))
        return anchors

    # Analyze the index page.
    def get_index(self):
        """Visit the index page and queue month links for every listed city."""
        index_page = self.get_page(self.index_url)
        if index_page is None:
            # get_page() already logged the failure; nothing to parse.
            return
        for city in self._get_city_anchors(index_page):
            self.get_city(self.index_url + city.get("href"))

    # Analyze a single city page.
    def get_city(self, url):
        """Queue a link for each month of history available for one city.

        Links are cached and de-duplicated in Redis (the producer side of
        the pipeline) so data is not lost if the crawl is interrupted.
        """
        city_page = self.get_page(url)
        if city_page is None:
            return
        soup = BeautifulSoup(city_page, "html.parser")
        # First <option> is a placeholder, not a real month.
        months = soup.select(".optionbox > select > option")[1:]
        if not months:
            return
        for month in months:
            self.dao.save_link(url.replace("index", month.get("value")))

    # Analyze one month's detail page of daily weather records.
    def get_detail(self):
        """Pop one queued month page and store its daily records in Redis."""
        url = self.dao.get_link()
        detail_page = self.get_page(url)
        if detail_page is None:
            return
        soup = BeautifulSoup(detail_page, "html.parser")
        # Last <li> is a footer/summary row, so drop it.
        days = soup.select(".lishitable_content > li")[:-1]
        if len(days) == 0:
            # Newer page layout uses a different list container.
            days = soup.select(".thrui > li")
        # Strip the fixed-length breadcrumb prefix and suffix to isolate
        # the city name — assumes the h4 text layout is stable; TODO confirm.
        city = soup.select(".crumbs > h4")[0].text[14:][:-4]
        for day in days:
            try:
                div = day.select("div")
                # Date may or may not be wrapped in a link depending on layout.
                if div[0].a is None:
                    date = div[0].text.split("-")
                else:
                    date = div[0].a.text.split("-")
                Y = date[0]
                M = date[1]
                D = date[2]
                if len(D) > 3:
                    # Presumably the day carries a trailing weekday label;
                    # keep only the two digit characters. TODO confirm.
                    D = D[:2]
                top_temperature = div[1].text.replace("℃", "")
                low_temperature = div[2].text.replace("℃", "")
                weather = div[3].text
                self.dao.set(city, "[{},{},{},{}, {}, '{}']".format(Y, int(M), int(D), top_temperature, low_temperature,
                                                                    weather))
            except Exception as e:
                # Skip malformed rows rather than aborting the whole month.
                print(e)
                continue

    # Generic page-fetching helper.
    def get_page(self, url):
        """GET *url* and return its text, or None on any failure (logged)."""
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            print('抓取成功', url, response.status_code)
            return response.text
        except Exception as e:
            print(e)
            print('抓取失败', url)
            return None

    # Crawl every city.
    def crawl_all(self):
        """Queue month links for all cities, then drain the whole queue."""
        self.get_index()
        for _ in range(len(self.dao.list_all("links"))):
            self.get_detail()

    # Crawl a single named city.
    def crawl_city(self, city):
        """Queue and crawl the month pages for one city named *city*.

        Silently returns if the index page cannot be fetched or no city
        with that exact name is listed (the original raised NameError on
        an unknown city because ``city_url`` was never bound).
        """
        index_page = self.get_page(self.index_url)
        if index_page is None:
            return
        city_url = None
        for anchor in self._get_city_anchors(index_page):
            if anchor.text == city:
                city_url = self.index_url + anchor.get("href")
        if city_url is None:
            return
        self.get_city(city_url)
        for _ in range(len(self.dao.list_all("links"))):
            self.get_detail()
